#!/usr/bin/env python3
#
#  Process the relationship between journal subjects and their
#    coverage breadth from GARUDA and SINTA
#  
#  Copyright 2025 Eko Didik Widianto <didik@live.undip.ac.id>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program. If not, see <https://www.gnu.org/licenses/>.
#

import sys
from pathlib import Path    # Path info
import re

import pandas as pd     # Pandas

# UpSet plots of set intersections (upsetplot)
from upsetplot import from_memberships, UpSet

import matplotlib.pyplot as plt

# Path definition: every location is derived from the directory holding this script
path_abs = Path(__file__).parent.absolute() # Use absolute path
sys.path.append(f"{path_abs}/..") # Adds higher directory to python modules path
# Input directory: main() reads '<name>.json' files from here
in_path = f"{path_abs}/data_master/"
# Reporting directory (not referenced elsewhere in the visible part of this file)
report_path = f'{path_abs}/reporting/'

# Output directory: main() writes its CSV and report files here
result_path = f'{path_abs}/results/'

# Own functions
# NOTE: this import must stay below the sys.path.append() above; presumably the
# 'util' package lives in the parent directory -- confirm against the repo layout.
from util.utils import save_file, open_json_file, save_aprofile

# Data sources, one entry per source: [display name, journal master file stem].
# The stem is used by main() both to build the input name '<stem>.json' under
# in_path and to name the output files under result_path.
data_file = [['Garuda', 'gjournal'],
  ['SINTA', 'sjournal']]

def _count_subject_sets(jdf):
  """Count journals per distinct combination of subject areas.

  Args:
    jdf: DataFrame with an 'id' column and a 'subj_areas' column holding a
      non-empty list of subject names for each journal.

  Returns:
    A tuple (subj_name, subjs, counts):
      subj_name -- sorted list of every distinct subject name,
      subjs     -- list of subject combinations (each a sorted list of names),
      counts    -- number of journals having exactly that combination,
                   aligned with subjs.
  """
  # One row per (journal, subject). Duplicate pairs are dropped so a journal
  # that lists the same subject twice is not counted as a separate combination
  # (which would also give the membership index duplicate entries).
  # Sorting by subject makes each journal's combination order-independent.
  exploded = (
    jdf.explode('subj_areas')
    .drop_duplicates(subset=['id', 'subj_areas'])
    .sort_values(by=['subj_areas'])
    .reset_index(drop=True)
  )
  subj_name = exploded['subj_areas'].unique().tolist()

  # Collect the (already sorted) subjects of each journal back into one list
  per_journal = exploded.groupby(['id']).agg({'subj_areas': list}).reset_index()

  # Lists are unhashable, so group on the tuple form of each combination
  combo_key = per_journal['subj_areas'].map(tuple)
  per_combo = per_journal.groupby(combo_key).agg({'id': 'count'}).reset_index()

  subjs = per_combo['subj_areas'].map(list).to_list()
  counts = per_combo['id'].to_list()
  return subj_name, subjs, counts


def _build_report(sets, subj_name):
  """Turn the membership-indexed counts into a flat report table.

  Args:
    sets: Series of journal counts as returned by from_memberships(); its
      index levels become one boolean column per subject.
    subj_name: list of the subject names present as index levels of sets.

  Returns:
    DataFrame sorted by descending 'count', with added columns 'countp'
    (share of all journals, in percent, rounded to 2 decimals) and
    'num_subject' (number of subjects in the combination).
  """
  df_rep = (
    pd.DataFrame(sets)
    .reset_index()
    .rename(columns={0: 'count'})  # the unnamed Series becomes column 0
    .sort_values(['count'], ascending=False)
  )
  total = df_rep['count'].sum()
  df_rep['countp'] = df_rep['count'].apply(lambda x: round(100 * x / total, 2))

  # The membership columns are boolean, so a row-wise sum counts the subjects
  df_rep['num_subject'] = df_rep[subj_name].sum(axis=1)
  return df_rep


def main():
  """Analyse and plot how journal subject areas co-occur for each data source.

  For every entry of data_file the active journals with at least one subject
  area are loaded, the number of journals per subject combination is counted,
  and the result is
    * written to '04-<stem>_subject.csv' under result_path,
    * drawn as an UpSet plot (one figure per source),
    * passed to save_file() as a report table ('04-<stem>_subj-relation').
  All figures are displayed together once every source has been processed.
  """
  print("Subjects relation")
  print("================")

  cols = ['id', 'title', 'publisher', 'pissn', 'eissn', 'subj_areas']

  # Plots
  plt.rcParams['font.size'] = 9

  for _, jour_src in data_file:
    # NOTE(review): open_json_file is presumably returning a pandas DataFrame
    # with an 'is_active' boolean column -- confirm in util.utils.
    jdf = open_json_file(in_path + jour_src + '.json')

    # Keep active journals only; copy so the assignment below works on an
    # independent frame instead of a chained selection of the loaded data.
    jdf = jdf[jdf['is_active']][cols].copy()

    # Missing subject areas become an empty list, then such journals are dropped
    jdf['subj_areas'] = jdf['subj_areas'].fillna('').apply(list)
    jdf = jdf[jdf['subj_areas'].str.len() > 0].reset_index(drop=True)

    subj_name, subjs, counts = _count_subject_sets(jdf)

    sets = from_memberships(subjs, data=counts)
    sets.to_csv(f'{result_path}/04-{jour_src}_subject.csv')

    # Only the 15 top-ranked subsets (by size) are drawn
    upset = UpSet(sets, sort_by='cardinality', element_size=None,
      max_subset_rank=15,
      show_counts="{:,d}"#, show_percentages=True
      )

    # Later calls override earlier ones: degree 2 -> blue, 3 -> purple, >=4 -> red
    upset.style_subsets(min_degree=2, facecolor="blue")
    upset.style_subsets(min_degree=3, facecolor="purple")
    upset.style_subsets(min_degree=4, facecolor="red")

    fig = plt.figure(figsize=(10, 6))
    plot_res = upset.plot(fig=fig)

    intersections = plot_res["intersections"]
    intersections.set_ylabel("Subset size", fontsize=11)
    # Nss: number of distinct subject combinations, Nj: number of journals
    intersections.text(0.95, 0.85, f'Nss: {len(subjs):,d}\nNj: {sum(counts):,d}',
      fontsize=11, ha='right', va='bottom',
      transform=intersections.transAxes)

    plot_res["totals"].set_xlabel("Subject size", fontsize=11)
    plot_res["matrix"].tick_params(axis='both', labelsize=10)

    # Reporting
    df_rep = _build_report(sets, subj_name)
    save_file(df_rep, f'{result_path}/04-{jour_src}_subj-relation', 'Relation of journal subjects')

  plt.show()


# Run the analysis only when this file is executed as a script (not on import)
if __name__ == '__main__':
  main()
